Graphical inspection
plotDispEsts(dds, main="Dispersion plot")

rld <- rlogTransformation(dds)
head(assay(rld))
## SRR24750870 SRR24750871 SRR24750872 SRR24750873 SRR24750874
## ENSG00000000003.16 5.917535 5.828125 5.665265 5.953147 5.928074
## ENSG00000000005.6 2.760746 1.814801 1.738242 1.886832 1.670663
## ENSG00000000419.14 8.313336 8.446128 8.476905 8.570571 8.660874
## ENSG00000000457.14 7.309331 7.472447 7.559360 7.837606 7.284328
## ENSG00000000460.17 5.660049 5.066636 4.927018 5.000241 5.538527
## ENSG00000000938.13 5.033313 5.381299 5.690622 5.700456 6.275428
## SRR24750875 SRR24750879 SRR24750883 SRR24750884 SRR24750885
## ENSG00000000003.16 5.859484 6.058560 6.274450 6.327279 5.819210
## ENSG00000000005.6 2.000234 1.868700 1.684107 1.670603 1.666329
## ENSG00000000419.14 9.015589 8.604115 8.960194 8.722805 8.868794
## ENSG00000000457.14 7.747022 7.541706 7.860924 7.970687 7.769912
## ENSG00000000460.17 5.466104 5.270120 5.597530 5.555343 5.680307
## ENSG00000000938.13 5.849431 5.102436 5.775555 5.117404 5.248318
## SRR24750886 SRR24750887 SRR24750888 SRR24750889 SRR24750890
## ENSG00000000003.16 5.791106 5.761190 5.834337 5.571565 6.064573
## ENSG00000000005.6 2.113572 1.953646 1.857564 2.033945 1.915987
## ENSG00000000419.14 8.927451 9.157233 8.872940 9.216153 8.461380
## ENSG00000000457.14 7.612505 7.582885 7.737253 7.941017 7.404776
## ENSG00000000460.17 5.207095 5.674224 5.690186 5.621894 5.153109
## ENSG00000000938.13 7.404563 4.785715 5.036387 5.296468 5.381870
## SRR24750891 SRR24750892 SRR24750893 SRR24750894 SRR24750895
## ENSG00000000003.16 5.739619 6.028900 6.300341 5.637114 6.145888
## ENSG00000000005.6 1.740251 1.679606 1.960789 1.677105 1.733698
## ENSG00000000419.14 9.047236 8.609687 8.373867 8.631808 8.417076
## ENSG00000000457.14 7.521993 7.699256 7.608436 7.670385 7.539165
## ENSG00000000460.17 5.758669 5.720892 5.307104 5.467281 5.823575
## ENSG00000000938.13 4.835364 4.839947 5.439222 4.677890 5.477796
## SRR24750896 SRR24750897 SRR24750898 SRR24750899 SRR24750900
## ENSG00000000003.16 5.868366 6.079529 5.775011 6.174295 6.176189
## ENSG00000000005.6 1.670717 2.351702 1.894241 1.921822 1.871424
## ENSG00000000419.14 8.587082 8.348925 8.674711 8.638459 8.338870
## ENSG00000000457.14 7.623633 7.465805 8.003973 7.633382 7.497142
## ENSG00000000460.17 5.784000 5.611291 5.932256 4.968874 6.281634
## ENSG00000000938.13 5.676657 5.907561 6.096818 5.774717 4.993957
## SRR24750901 SRR24750902
## ENSG00000000003.16 6.045202 5.708710
## ENSG00000000005.6 1.672514 3.226686
## ENSG00000000419.14 8.409216 8.406539
## ENSG00000000457.14 7.601665 7.734756
## ENSG00000000460.17 5.482304 5.276328
## ENSG00000000938.13 5.124617 5.336447
mycols <- brewer.pal(8, "Dark2")[1:length(unique(samples_healthy$tissue))]
mycols2 <- brewer.pal(8, "Set3")[1:length(unique(samples_healthy$sex))]
sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[samples_healthy$tissue],
RowSideColors=mycols2[samples_healthy$sex],
margin=c(10, 10), main="Sample Distance Matrix")
legend(x=-0.03,y=1.1, legend = levels(samples_healthy$tissue), fill = mycols, title = "Categories",bty='n')
legend(x=0.07,y=1.1, legend = levels(samples_healthy$sex), fill = mycols2, title = "Categories",bty='n')

df <- as.data.frame(colData(dds)[,c("tissue", "sex")])
pheatmap(as.matrix(sampleDists), annotation_col=df, annotation_row = df)

selected <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("tissue", "sex")])
pheatmap(assay(rld)[selected,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=FALSE, annotation_col=df)

DESeq2::plotPCA(rld, intgroup=c("tissue"))
## using ntop=500 top features by variance

pcaData <- plotPCA(rld, intgroup=c("tissue",'sex'), returnData=TRUE)
## using ntop=500 top features by variance
pcaData$isolate <- rld@colData@listData$isolate
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=tissue, shape=sex, label= str_sub(isolate, -2, -1))) +
geom_point(size=3) +
geom_path(aes(group = str_sub(isolate, -2, -1))) +
geom_text_repel(aes(label=str_sub(isolate, -2, -1)), size=3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
theme_minimal()

# hist(res$pvalue, breaks=50, col="grey")
# DESeq2::plotMA(dds, ylim=c(-1,1))
#
# # Volcano plot
# with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
# with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
fig <- plot_ly(x = res$pvalue, type = "histogram")
fig <- fig %>% layout(title = "Histogram of p-values",
xaxis = list(title = "Value"),
yaxis = list(title = "Frequency"))
fig
easyMAplot(res, output_shiny=FALSE)
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
## Ensembl site unresponsive, trying useast mirror
res <- res[order(-abs(res$log2FoldChange)), ]
resDF <- as.data.frame(res)
resFiltered <- resDF %>% dplyr::filter(padj<0.05)
universe <- getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id","description","entrezgene_description"), "ensembl_gene_id_version", tx2gene$GENEID, ensembl)
resFiltered$ensembl_gene_id <- gsub("\\.\\d+$", "", row.names(resFiltered))
gene_info <- getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id","description","entrezgene_description"), "ensembl_gene_id", resFiltered$ensembl_gene_id, ensembl) |> group_by(ensembl_gene_id) |> dplyr::slice_head(n = 1) |> ungroup()
resFiltered <- resFiltered %>%
left_join(gene_info, by = "ensembl_gene_id")
top_peaks <- as.data.frame(head(resFiltered,10))
a <- list()
for (i in seq_len(nrow(top_peaks))) {
m <- top_peaks[i, ]
annot = ifelse(m[["hgnc_symbol"]]=="", "novel", m[["hgnc_symbol"]])
ax <- sample(c(-1, 1), 1) * runif(1, 5, 60)
ay <- sample(c(-1, 1), 1) * runif(1, 5, 40)
a[[i]] <- list(
x = m[["log2FoldChange"]],
y = -log10(m[["pvalue"]]),
text = annot,
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 0.5,
ax = ax,
ay = ay
)
}
easyVolcano(res, fccut = 2, output_shiny=FALSE) %>% layout(annotations = a)
data(kegg.sets.hs)
data(sigmet.idx.hs)
kegg.sets.hs <- kegg.sets.hs[sigmet.idx.hs]
foldchanges <- resFiltered$log2FoldChange
names(foldchanges) <- resFiltered$entrezgene_id
keggres <- gage(foldchanges, gsets=kegg.sets.hs, same.dir=FALSE)
lapply(keggres, head)
## $greater
## p.geomean stat.mean p.val
## hsa04350 TGF-beta signaling pathway 0.1768113 0.9381318 0.1768113
## hsa04973 Carbohydrate digestion and absorption 0.2252355 0.7681066 0.2252355
## hsa00564 Glycerophospholipid metabolism 0.2594676 0.6578153 0.2594676
## hsa04020 Calcium signaling pathway 0.2710911 0.6117936 0.2710911
## hsa04530 Tight junction 0.2840787 0.5733486 0.2840787
## hsa04610 Complement and coagulation cascades 0.3597654 0.3664711 0.3597654
## q.val set.size exp1
## hsa04350 TGF-beta signaling pathway 0.9999842 22 0.1768113
## hsa04973 Carbohydrate digestion and absorption 0.9999842 13 0.2252355
## hsa00564 Glycerophospholipid metabolism 0.9999842 12 0.2594676
## hsa04020 Calcium signaling pathway 0.9999842 48 0.2710911
## hsa04530 Tight junction 0.9999842 38 0.2840787
## hsa04610 Complement and coagulation cascades 0.9999842 10 0.3597654
##
## $stats
## stat.mean exp1
## hsa04350 TGF-beta signaling pathway 0.9381318 0.9381318
## hsa04973 Carbohydrate digestion and absorption 0.7681066 0.7681066
## hsa00564 Glycerophospholipid metabolism 0.6578153 0.6578153
## hsa04020 Calcium signaling pathway 0.6117936 0.6117936
## hsa04530 Tight junction 0.5733486 0.5733486
## hsa04610 Complement and coagulation cascades 0.3664711 0.3664711
keggrespathwaysUp <- data.frame(id=rownames(keggres$greater), keggres$greater) %>%
tibble::as_tibble() %>%
dplyr::filter(row_number()<=6) %>%
.$id %>%
as.character()
keggrespathwaysUp
## [1] "hsa04350 TGF-beta signaling pathway"
## [2] "hsa04973 Carbohydrate digestion and absorption"
## [3] "hsa00564 Glycerophospholipid metabolism"
## [4] "hsa04020 Calcium signaling pathway"
## [5] "hsa04530 Tight junction"
## [6] "hsa04610 Complement and coagulation cascades"
keggresids <- substr(c(keggrespathwaysUp), start=1, stop=8)
keggresids
## [1] "hsa04350" "hsa04973" "hsa00564" "hsa04020" "hsa04530" "hsa04610"
plot_pathway <- function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="hsa", new.signature=FALSE)
#detach("package:dplyr", unload=T)
tmp <- sapply(keggresids, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="hsa"))
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04350.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04973.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa00564.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04020.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04530.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04610.pathview.png
resDF <- resDF %>% mutate(diffexpressed = dplyr::case_when(
log2FoldChange > 0 & padj < 0.05 ~ 'UP',
log2FoldChange < 0 & padj < 0.05 ~ 'DOWN',
padj > 0.05 ~ 'NO'
))
original_gene_list <- resDF$log2FoldChange
names(original_gene_list) <- gsub("\\.\\d+$", "", row.names(resDF))
gene_list<-na.omit(original_gene_list)
gene_list = sort(gene_list, decreasing = TRUE)
gse <- gseGO(geneList=gene_list,
ont ="ALL",
keyType = "ENSEMBL",
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
verbose = TRUE,
OrgDb = org.Hs.eg.db,
pAdjustMethod = "none")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
dotplot(gse, showCategory=10, font.size=7, split=".sign") + facet_grid(.~.sign)

#emapplot(gse, showCategory = 10)
websiteLive <- getOption("enrichR.live")
if (websiteLive) {
listEnrichrSites()
setEnrichrSite("Enrichr") # Human genes
}
## Enrichr ... Connection is Live!
## FlyEnrichr ... Connection is Live!
## WormEnrichr ... Connection is Live!
## YeastEnrichr ... Connection is Live!
## FishEnrichr ... Connection is Live!
## OxEnrichr ... Connection is Live!
## Connection changed to https://maayanlab.cloud/Enrichr/
## Connection is Live!
if (websiteLive) dbs <- listEnrichrDbs()
dbs <- c("GO_Molecular_Function_2023", "GO_Cellular_Component_2023", "GO_Biological_Process_2023","GO_Molecular_Function_2018", "GO_Cellular_Component_2018", "GO_Biological_Process_2018")
if (websiteLive) {
enriched <- enrichr(names(gene_list), dbs)
}
## Uploading data to Enrichr... Done.
## Querying GO_Molecular_Function_2023... Done.
## Querying GO_Cellular_Component_2023... Done.
## Querying GO_Biological_Process_2023... Done.
## Querying GO_Molecular_Function_2018... Done.
## Querying GO_Cellular_Component_2018... Done.
## Querying GO_Biological_Process_2018... Done.
## Parsing results... Done.
df <- resDF |> tibble::rownames_to_column("ensembl_gene_id") |> dplyr::filter(padj<0.05)
up_regulated <- df |> dplyr::filter(log2FoldChange > 0) |> dplyr::mutate(ensembl_gene_id = gsub("\\.\\d+$", "", ensembl_gene_id)) |> dplyr::select(ensembl_gene_id)
write.table(up_regulated, 'up_ta_vl.txt', row.names = FALSE, col.names = FALSE, quote = FALSE)
down_regulated <- df |> dplyr::filter(log2FoldChange < 0) |> dplyr::mutate(ensembl_gene_id = gsub("\\.\\d+$", "", ensembl_gene_id)) |> dplyr::select(ensembl_gene_id)
write.table(down_regulated, 'down_ta_vl.txt', row.names = FALSE, col.names = FALSE, quote = FALSE)
# Choose databases
dbs <- c("GO_Biological_Process_2021", "KEGG_2021_Human", "WikiPathways_2019_Human", "Jensen_DISEASES")
genes <- names(gene_list)
# Run enrichment analysis
enriched_results <- enrichr(genes, dbs)
## Uploading data to Enrichr... Done.
## Querying GO_Biological_Process_2021... Done.
## Querying KEGG_2021_Human... Done.
## Querying WikiPathways_2019_Human... Done.
## Querying Jensen_DISEASES... Done.
## Parsing results... Done.
# Access results for a specific database
go_results <- enriched_results[["GO_Biological_Process_2021"]]
kegg_results <- enriched_results[["KEGG_2021_Human"]]
head(kegg_results)
head(go_results)
genes <- names(gene_list)
unvierse1 <- gsub("\\.\\d+$", "", row.names(resDF))
unvierse2 <- getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id","description","entrezgene_description"), "ensembl_gene_id", unvierse1, ensembl) |> group_by(ensembl_gene_id) |> dplyr::slice_head(n = 1) |> ungroup()
universe3 <- unvierse2$entrezgene_id
ego <- enrichGO(gene = as.character(gene_info$entrezgene_id),
universe = as.character(universe3),
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)
egoDF <- ego@result
write.table(egoDF, 'BP_TA_VL.csv')
egoCC <- enrichGO(gene = as.character(gene_info$entrezgene_id),
#universe = as.character(universe$entrezgene_id),
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.1,
readable = TRUE)
egoCCDF <- egoCC@result
write.table(egoCCDF, 'CC_TA_VL.csv')
ggo <- groupGO(gene = as.character(gene_info$entrezgene_id),
OrgDb = org.Hs.eg.db,
ont = "BP",
level = 4,
readable = TRUE)
head(ggo)
gggo <- ggo@result
w <- egoCCDF %>% filter(pvalue < 0.05) %>% select(c("ID","pvalue"))
write.table(w, "table2.tsv",quote = FALSE, row.names = FALSE)